In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import statsmodels.api as sm
import numpy as np
Q1 (10 points)
The heart dataframe contains the survival time after receiving a heart transplant, the age of the patient and whether or not the survival time was censored
Variable name definitions::
Answer the following questions with respect to the heart data set:
In [2]:
heart = sm.datasets.heart.load_pandas().data
heart.head(n=6)
Out[2]:
In [3]:
heart.sort_values('age', ascending=False, inplace=True)
heart.head()
Out[3]:
In [4]:
sum(heart.censors == 0)
Out[4]:
In [5]:
heart.loc[(heart.censors == 1) & (heart.age < 45), 'age'].mean()
Out[5]:
In [6]:
heart.groupby(['censors']).agg(['mean', 'std'])
Out[6]:
In [7]:
import seaborn as sns
sns.lmplot(data=heart, x='age', y='survival', hue='censors')
pass
Q2 (10 points)
Write a flatmap function that works like map except that the function given takes a list and returns a list of lists that is then flattened (4 points).
In other words, flatmap takes two arguments, a function and a list (or other iterable), just like map. Howevver the function given as the first agument takes a single argument and returns a list (or ohter iterable). In order to get a simple list back, we need to unravel the reuslting list of lists, hence the flatten part.
For example,
flatmap(lambda x: x.split(), ["hello world", "the quick dog"])
should return
["hello", "world", "the", "quick", "dog"]
In [8]:
def flatten(list_of_lists):
"""Flatten a list of lists."""
for alist in list_of_lists:
for item in alist:
yield item
def flatmap(f, xs):
"""First map, then flatten result."""
return flatten(map(f, xs))
In [9]:
list(flatmap(lambda x: x.split(), ["hello world", "the quick dog"]))
Out[9]:
Q3 (10 points)
An affine transformation of a vector $x$ is the operation $Ax + b$, where $A$ is a matrix and $b$ is a vector.
In [10]:
def affine(x, A, b):
"""Affine transofrm."""
return A@x + b
In [11]:
def rev_affine(y, A, b):
"""Reverse affine transform."""
return np.linalg.solve(A, (y - b))
In [12]:
A = np.random.random((3,3,))
b = np.random.random(3)
x = np.random.random(3)
In [13]:
x
Out[13]:
In [14]:
y = affine(x, A, b)
y
Out[14]:
In [15]:
rev_affine(y, A, b)
Out[15]:
Q4 (10 points)
You are given the following DNA sequecne in FASTA format.
dna = '''> A simulated DNA sequence.
TTAGGCAGTAACCCCGCGATAGGTAGAGCACGCAATCGTCAAGGCGTGCGGTAGGGCTTCCGTGTCTTACCCAAAGAAAC
GACGTAACGTTCCCCGGGCGGTTAAACCAAATCCACTTCACCAACGGCATAACGCGAAGCCCAAACTAAATCGCGCTCGA
GCGGACGCACATTCGCTAGGCTGTGTAGGGGCAGTCTCCGTTAAGGACGATTACCACGTGATGGTAGTTCGCAACATTGG
ACTGTCGGGAATTCCCGAAGGCACTTAAGCGGAGTCTTAGCGTACAGTAACGCAGTCCCGCGTGAACGACTGACAGATGA
'''
In [16]:
dna = '''> A simulated DNA sequence.
TTAGGCAGTAACCCCGCGATAGGTAGAGCACGCAATCGTCAAGGCGTGCGGTAGGGCTTCCGTGTCTTACCCAAAGAAAC
GACGTAACGTTCCCCGGGCGGTTAAACCAAATCCACTTCACCAACGGCATAACGCGAAGCCCAAACTAAATCGCGCTCGA
GCGGACGCACATTCGCTAGGCTGTGTAGGGGCAGTCTCCGTTAAGGACGATTACCACGTGATGGTAGTTCGCAACATTGG
ACTGTCGGGAATTCCCGAAGGCACTTAAGCGGAGTCTTAGCGTACAGTAACGCAGTCCCGCGTGAACGACTGACAGATGA
'''
dna = ''.join(dna.split('\n')[1:])
dna
Out[16]:
In [17]:
from collections import Counter
c = Counter(zip(dna[:-1], dna[1:]))
c
Out[17]:
In [18]:
sum(c.values())
Out[18]:
Q5 (10 points)
The code given below performs a stochastic gradient descent to fit a quadratic polynomila to $n$ data points. Maake the code run faster by:
numba JIT CythonSome test code is provided. Please run this for your optimized versios to confirm that they give the same resuls as the orignal code.
In [19]:
def sgd(b, x, y, max_iter, alpha):
n = x.shape[0]
for i in range(max_iter):
for j in range(n):
b[0] -= alpha * (2*(b[0] + b[1]*x[j] + b[2]*x[j]**2 - y[j]))
b[1] =- alpha * (2*x[j] * (b[0] + b[1]*x[j] + b[2]*x[j]**2 - y[j]))
b[2] -= alpha * (2*x[j]**2 * (b[0] + b[1]*x[j] + b[2]*x[j]**2 - y[j]))
return b
In [20]:
%%time
np.random.seed(12345)
n = 10000
x = np.linspace(0, 10, n)
y = 2*x**2 + 6*x + 3 + np.random.normal(0, 5, n)
k = 100
alpha = 0.00001
b0 = np.random.random(3)
b = sgd(b0, x, y, k, alpha)
yhat = b[0] + b[1]*x+ b[2]*x**2
idx = sorted(np.random.choice(n, 100))
plt.scatter(x[idx], y[idx])
plt.plot(x[idx], yhat[idx], c='red')
pass
In [21]:
from numba import jit
@jit(nopython = True)
def sgd_numba(b, x, y, max_iter, alpha):
n = x.shape[0]
for i in range(max_iter):
for j in range(n):
b[0] -= alpha * (2*(b[0] + b[1]*x[j] + b[2]*x[j]**2 - y[j]))
b[1] =- alpha * (2*x[j] * (b[0] + b[1]*x[j] + b[2]*x[j]**2 - y[j]))
b[2] -= alpha * (2*x[j]**2 * (b[0] + b[1]*x[j] + b[2]*x[j]**2 - y[j]))
return b
In [22]:
# Call once to trigger compilation
sgd_numba(b0, x, y, k, alpha);
In [23]:
%%time
np.random.seed(12345)
n = 10000
x = np.linspace(0, 10, n)
y = 2*x**2 + 6*x + 3 + np.random.normal(0, 5, n)
k = 100
alpha = 0.00001
b0 = np.random.random(3)
b = sgd_numba(b0, x, y, k, alpha)
yhat = b[0] + b[1]*x+ b[2]*x**2
idx = sorted(np.random.choice(n, 100))
plt.scatter(x[idx], y[idx])
plt.plot(x[idx], yhat[idx], c='red')
pass
In [24]:
%load_ext cython
In [25]:
%%cython -a
cimport cython
@cython.wraparound(False)
@cython.boundscheck(False)
def sgd_cython(double[:] b, double[:] x, double[:] y, int max_iter, double alpha):
cdef int n = x.shape[0]
cdef int i, j
for i in range(max_iter):
for j in range(n):
b[0] -= alpha * (2*(b[0] + b[1]*x[j] + b[2]*x[j]**2 - y[j]))
b[1] =- alpha * (2*x[j] * (b[0] + b[1]*x[j] + b[2]*x[j]**2 - y[j]))
b[2] -= alpha * (2*x[j]**2 * (b[0] + b[1]*x[j] + b[2]*x[j]**2 - y[j]))
return b
Out[25]:
In [26]:
%%time
np.random.seed(12345)
n = 10000
x = np.linspace(0, 10, n)
y = 2*x**2 + 6*x + 3 + np.random.normal(0, 5, n)
k = 100
alpha = 0.00001
b0 = np.random.random(3)
b = sgd_cython(b0, x, y, k, alpha)
yhat = b[0] + b[1]*x+ b[2]*x**2
idx = sorted(np.random.choice(n, 100))
plt.scatter(x[idx], y[idx])
plt.plot(x[idx], yhat[idx], c='red')
pass
In [ ]: